In [1]:
%pylab inline
plt.xkcd()
import pandas
from collections import Counter
import string
from IPython.core.display import HTML, Image
def display_counter(most_common, count=5, fmt=repr):    
    df = pandas.DataFrame([(fmt(char), count) for char, count in most_common], columns=['Character', 'Count'])
    return HTML(df.to_html(index=False))
width = 0.7


Populating the interactive namespace from numpy and matplotlib

What is the Most Used Character in Python 3?

Using Python - obviously

Load up the standard library ...


In [2]:
from collections import Counter
import os
PYTHON_ROOT = '/usr/lib/python3.4/'
symbol_counter = Counter()
for dir_path, dir_names, files in os.walk(PYTHON_ROOT):
    for filename in files:
        if filename.endswith('.py'):
            with open(os.path.join(dir_path, filename), 'r') as f:
                symbol_counter.update(f.read())

And the Most Common Character Is


In [3]:
all_chars = symbol_counter.most_common()
display_counter(symbol_counter.most_common(5))


Out[3]:
Character Count
' ' 2926016
'e' 676323
't' 432209
's' 369292
'r' 347924

Ok, ignoring those boring ones ...


In [4]:
BORING_CHARACTERS = string.ascii_letters + ' ' + '\n'
interesting_counter = symbol_counter.copy()
for character in BORING_CHARACTERS:
    interesting_counter[character] = 0
interesting_chars = (+interesting_counter).most_common()
interesting_chars_dict = dict(interesting_chars)
display_counter(interesting_chars[:7], fmt=str)


Out[4]:
Character Count
_ 129900
. 118813
' 118319
, 100226
) 96055
( 96002
0 94189

What does that look like as a graph?


In [5]:
fig = plt.figure(figsize=(14, 6))
x_chars, y_counts = list(zip(*interesting_chars[:12]))
indexes = np.arange(len(x_chars))
plt.bar(indexes, y_counts, width, color='#669DC7', figure=fig)
plt.ylabel('Counts', fontsize=20)
plt.title('Most common characters', fontsize=30)
plt.xticks(indexes+width/2., x_chars, fontsize=50)
plt.yticks(fontsize=25);


This is why programming is so hard


In [6]:
Image("keyboard_punctuation_keys.png")


Out[6]:

What About the Numbers?


In [7]:
fig = plt.figure(figsize=(14, 5));plt.hold('on')
indexes = np.arange(10)
x_nums = list(map(str, indexes))
y_counts = [interesting_chars_dict[num] for num in x_nums]
plt.bar(indexes, y_counts, width, color='#669DC7', figure=fig)
plt.plot(indexes+width/2., np.log10(1 + 1 / indexes) * sum(y_counts[1:]))
plt.ylabel('Counts', fontsize=20)
plt.title('Is this your number?', fontsize=30)
plt.xticks(indexes+width/2., x_nums, fontsize=50)
plt.yticks(fontsize=25);


The line follows Benford's Law - Python sort of doesn't

So ... Other Interesting Factoids

How many tabs in the Python Standard Library?


In [8]:
interesting_chars_dict['\t']


Out[8]:
4

How many non-ASCII characters


In [9]:
non_ascii = set(interesting_chars_dict) - set(map(chr, range(0, 127)))
len(non_ascii)


Out[9]:
94

In [10]:
print(*non_ascii)


ι Ë ç Ï κ Ì Ò µ Ô Þ Ã Ç ô Û à φ β ϑ á Í ſ Â ì ẛ û ê Ý ò ë Ñ ſt ΰ ü Á ã ι ù ΐ Ó ø ϰ Æ ñ À É Ù ö ÿ Õ ß Ê Ä õ ΐ μ π θ È ΰ ý Ø σ æ ε ó ϐ Î þ å ä î ϵ ϕ í Å st Ð ϖ ϱ Ú Ö è   ı é ð â ρ Ł Ü ï ς ú ṡ

Thank You

Thank you to

  • Thanks to Grant for comming up with the idea
  • XKCD
  • MatPlotLib for having XKCD-style plots
  • Humor Sans

Dependancies

  • Python 3
  • IPython Notebook
  • MatPlotLib
  • Pandas

In [11]:
for dir_path, dir_names, files in os.walk(PYTHON_ROOT):
    for filename in files:
        if filename.endswith('.py'):
            with open(os.path.join(dir_path, filename), 'r') as f:
                file_content = f.read()
                if '\t' in file_content:
                    print(os.path.join(dir_path, filename))
                    for line_no, line in enumerate(file_content.splitlines()):
                        if '\t' in line:
                            print(line_no, ":", repr(line))


/usr/lib/python3.4/gettext.py
383 : '\t# first look into the standard locale dir, then into the '
384 : '\t# langpack locale dir'
386 : '\t# standard mo file'
393 : '\t# langpack mofile -> use it'

The whole thing ...


In [13]:
fig = plt.figure(figsize=(4*8,3*8))
## Skip space(idx: 0) and newline(idx: 9)
x_chars, y_counts = list(zip(*(all_chars[1:9] + all_chars[10:97])))
indexes = np.arange(len(x_chars))
plt.bar(indexes, y_counts, width, color='#669DC7', figure=fig)
plt.ylabel('Counts', fontsize=20)
plt.title('Most common characters', fontsize=30)
plt.xticks(indexes+width/2., x_chars, fontsize=30)
plt.yticks(fontsize=25);
plt.xlim(min(indexes), max(indexes))
plt.yscale('log', nonposy='clip')